#!/usr/bin/env bash
# Launch configuration for an AgentCDM training run.
#
# This file exports the settings below as environment variables and then
# invokes the training script that matches the selected algorithm.
# To switch configuration, comment/uncomment the alternatives in each section.

# ---- runtime ----
export VLLM_ATTENTION_BACKEND=XFORMERS

# Project root. The reward-function file, log path and output directory
# further down are all built from this value.
# NOTE(review): "PORJECT" looks like a misspelling of "PROJECT", but the name
# is exported and may be read by the training scripts -- check them before
# renaming it.
export PORJECT_PATH=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm

# export WANDB_MODE=offline

# Temporary directory for child processes.
# NOTE(review): the directory name suggests this is meant for Ray temp files
# -- confirm; nothing in this file creates the directory.
export TMPDIR=/home/xuyang/ray_tmp
# export TMPDIR=/llm/nankai/xuyang_space/ray_tmp

# ---- GPUs ----
# In both configurations below, N_GPUS and ROLLOUT_TP_SIZE equal the number of
# device ids in CUDA_VISIBLE_DEVICES; presumably they must be changed together
# -- TODO confirm against the training script.
export ROLLOUT_TP_SIZE=2
export CUDA_VISIBLE_DEVICES=4,7
export N_GPUS=2

# Single-GPU alternative:
# export ROLLOUT_TP_SIZE=1
# export CUDA_VISIBLE_DEVICES=2
# export N_GPUS=1

# ---- base model ----
# BASE_MODEL defaults to <MODEL_ROOT>/<MODEL_NAME>. MODEL_NAME is also used as
# the first component of the experiment name.
MODEL_ROOT=/llm/HFModels
MODEL_NAME=Qwen2.5-7B
# MODEL_NAME=Qwen2.5-7B-Instruct

# MODEL_ROOT=/llm/nankai/xuyang_space/LLMs
# MODEL_NAME=Qwen2.5-3B

BASE_MODEL="${MODEL_ROOT}/${MODEL_NAME}"

# Optional overrides: uncomment exactly one line to replace the composed
# BASE_MODEL above with an explicit checkpoint path.

# step2 / mmlu_pro
# BASE_MODEL=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm/merge_model/mmlu_pro_best/Qwen2.5-7B-grpo-agent_cdm_25-data20250707-2048-3e-6-256-5-auto-False-False-0713

# step2 / mmlu
# BASE_MODEL=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm/merge_model/mmlu_best/Qwen2.5-7B-grpo-agent_cdm_25-data20250717_step2-2048-3e-6-256-5-auto-False-False-0718

# step2 / arc
# BASE_MODEL=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm/merge_model/arc_best/Qwen2.5-7B-grpo-agent_cdm_25-data20250719_step2-2048-3e-6-256-5-auto-False-False-0720

# step1 / mmlu
# BASE_MODEL=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm/merge_model/Qwen2.5-7B-grpo-agent_cdm_15-data202507017_step1-2048-3e-6-256-5-auto-False-True-0717-step-5

# step1 / arc
# BASE_MODEL=/llm/nankai/xuyang_space/project/r1_infra/agent_cdm/merge_model/Qwen2.5-7B-grpo-agent_cdm_15-data20250719_step1-2048-3e-6-256-5-auto-False-True-0719

# Plain pretrained / instruct models
# BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-3B
# BASE_MODEL=/llm/HFModels/Qwen2.5-7B-Instruct
# BASE_MODEL=/llm/HFModels/Llama-3-8B-Instruct
# BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-1.5B-Instruct
# BASE_MODEL=/llm/nankai/xuyang_space/LLMs/Qwen2.5-1.5B
# BASE_MODEL=/llm/HFModels/Qwen2.5-3B-Instruct

# LLaMA-Factory SFT checkpoints
# BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B/full/sft/checkpoint-500
# BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right/full/sft/checkpoint-500
# BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0610/full/sft/checkpoint-500
# BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0612/full/sft
# BASE_MODEL=/llm/nankai/xuyang_space/project/LLaMA-Factory/saves/Qwen2.5-3B-all_right-0613/full/sft/checkpoint-33

# Publish the final selection to child processes.
export MODEL_ROOT MODEL_NAME BASE_MODEL

# ---- training data ----
# DATA_DIR is <DATA_ROOT>/<DATA_NAME>; DATA_NAME also appears in the
# experiment name. Exactly one DATA_NAME assignment should be active.
DATA_ROOT=/llm/nankai/xuyang_space/data/AM-DeepSeek-Distilled-40M/agent_cdm/processed

# Earlier datasets
# DATA_NAME=data20250703
# DATA_NAME=data20250626
# DATA_NAME=data20250705
# DATA_NAME=data20250707
# DATA_NAME=data20250708
# DATA_NAME=data20250709
# DATA_NAME=data20250710
# DATA_NAME=data20250713

# mmlu_pro step2
# DATA_NAME=data20250707

# mmlu
# DATA_NAME=data202507017_step1
# DATA_NAME=data20250717_step2

# arc
# DATA_NAME=data20250719_step1
# DATA_NAME=data20250719_step2

# vanilla
# DATA_NAME=data20250725_arc
DATA_NAME=data20250725_mmlu

export DATA_ROOT DATA_NAME
export DATA_DIR="${DATA_ROOT}/${DATA_NAME}"


# ---- reward function ----
# REWARD_FUNCTION is the base name of a file under $PORJECT_PATH/reward_score/;
# CUSTOM_REWARD_FUNCTION is the full path to "<name>.py". The name is also
# part of the experiment name.
REWARD_FUNCTION=agent_cdm_6

# Other variants (uncomment one to replace the selection above):
# REWARD_FUNCTION=agent_cdm_14
# REWARD_FUNCTION=agent_cdm_15
# REWARD_FUNCTION=agent_cdm_16
# REWARD_FUNCTION=agent_cdm_18
# REWARD_FUNCTION=agent_cdm_19
# REWARD_FUNCTION=agent_cdm_20
# REWARD_FUNCTION=agent_cdm_22
# REWARD_FUNCTION=agent_cdm_23
# REWARD_FUNCTION=agent_cdm_24
# REWARD_FUNCTION=agent_cdm_25

CUSTOM_REWARD_FUNCTION="${PORJECT_PATH}/reward_score/${REWARD_FUNCTION}.py"
export REWARD_FUNCTION CUSTOM_REWARD_FUNCTION


# ---- project / logging ----
PROJECT_NAME=AgentCDM
LOG_PATH="${PORJECT_PATH}"
export PROJECT_NAME LOG_PATH

# ---- optimisation hyperparameters ----
LEARNING_RATE=3e-6
ROLLOUT_N=5
BATCH_SIZE=256
# BATCH_SIZE=4
DATA_SHUFFLE=False
MAX_RESPONSE_LENGTH=2048
export LEARNING_RATE ROLLOUT_N BATCH_SIZE DATA_SHUFFLE MAX_RESPONSE_LENGTH

# ---- checkpoint resume ----
# Alternative: resume from an explicit checkpoint directory.
# RESUME_PATH=/llm/nankai/xuyang_space/project/r1_infra/verl/checkpoints/AgentCDM/Qwen2.5-7B-grpo-agent_cdm_25-data20250710-2048-3e-6-256-5-auto-False-False-0712/global_step_5
# RESUME_MODE=resume_path

# RESUME_PATH is the literal string "null" here, not an empty value.
RESUME_PATH=null
RESUME_MODE=auto
export RESUME_PATH RESUME_MODE

# ---- algorithm ----
# The value selects which train_agent_cdm_<algorithm>.sh script is launched at
# the end of this file, and is part of the experiment name.
# algorithm=dapo
algorithm=grpo
# algorithm=rf
export algorithm

# USE_KL=False
USE_KL=True
export USE_KL

# ---- experiment identity ----
# NOTE(review): TODAY_DATA is a fixed literal tag, not read from the system
# clock; it looks like a month-day run date -- confirm before automating it.
TODAY_DATA=0725

# The experiment name is every setting above joined by "-", in this order:
# model, algorithm, reward function, dataset, max response length, learning
# rate, batch size, rollout count, resume mode, shuffle flag, KL flag, date tag.
EXPERIMENT_NAME="${MODEL_NAME}-${algorithm}-${REWARD_FUNCTION}-${DATA_NAME}"
EXPERIMENT_NAME="${EXPERIMENT_NAME}-${MAX_RESPONSE_LENGTH}-${LEARNING_RATE}-${BATCH_SIZE}-${ROLLOUT_N}"
EXPERIMENT_NAME="${EXPERIMENT_NAME}-${RESUME_MODE}-${DATA_SHUFFLE}-${USE_KL}-${TODAY_DATA}"
OUTPUT_DIR="${PORJECT_PATH}/outputs/${EXPERIMENT_NAME}"
export TODAY_DATA EXPERIMENT_NAME OUTPUT_DIR


# Launch the training script that matches the selected algorithm:
# <PORJECT_PATH>/scripts/train_agent_cdm_<algorithm>.sh.
# The path is derived from PORJECT_PATH (set near the top of this file) so the
# project root is defined in one place instead of being repeated as a literal.
train_script="${PORJECT_PATH}/scripts/train_agent_cdm_${algorithm}.sh"

# Fail with a clear message when the selected algorithm has no matching
# script (e.g. a typo in `algorithm`), instead of an opaque bash error.
if [ ! -f "${train_script}" ]; then
  printf 'error: training script not found: %s (algorithm=%s)\n' \
    "${train_script}" "${algorithm}" >&2
  # `return` only succeeds when this file is being sourced; otherwise exit.
  return 1 2>/dev/null || exit 1
fi

bash "${train_script}"
# bash "${PORJECT_PATH}/scripts/train_agent_cdm_rf.sh"